/*	$OpenBSD: rf_evenodd_dagfuncs.c,v 1.6 2000/08/08 16:07:41 peter Exp $	*/
/*	$NetBSD: rf_evenodd_dagfuncs.c,v 1.6 2000/03/30 12:45:40 augustss Exp $	*/

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: ChangMing Wu
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Code for RAID-EVENODD architecture.
 */

#include "rf_types.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagffrd.h"
#include "rf_dagffwr.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_configure.h"
#include "rf_parityscan.h"
#include "rf_evenodd.h"
#include "rf_evenodd_dagfuncs.h"

/* These redundant functions are for small write. */
/*
 * Table used for the parity ('P') update of a small write. It pairs
 * rf_RegularXorFunc and rf_SimpleXorFunc each with a descriptive name
 * string.
 */
RF_RedFuncs_t rf_EOSmallWritePFuncs = {
	rf_RegularXorFunc, "Regular Old-New P",
	rf_SimpleXorFunc, "Simple Old-New P"
};
/*
 * Table used for the 'E' (second redundancy) update of a small write. It
 * pairs rf_RegularONEFunc and rf_SimpleONEFunc each with a descriptive name
 * string. The name attached to rf_SimpleONEFunc says "Simple" so that it can
 * be told apart from the regular variant, as in rf_EOSmallWritePFuncs.
 */
RF_RedFuncs_t rf_EOSmallWriteEFuncs = {
	rf_RegularONEFunc, "Regular Old-New E",
	rf_SimpleONEFunc, "Simple Old-New E"
};
/* These redundant functions are for degraded read. */
/*
 * Table used for degraded-read recovery through parity: both function
 * slots hold rf_RecoveryXorFunc, with the same name string.
 */
RF_RedFuncs_t rf_eoPRecoveryFuncs = {
	rf_RecoveryXorFunc, "Recovery Xr",
	rf_RecoveryXorFunc, "Recovery Xr"
};
/*
 * Table used for degraded-read recovery through 'E': both function slots
 * hold rf_RecoveryEFunc, with the same name string.
 */
RF_RedFuncs_t rf_eoERecoveryFuncs = {
	rf_RecoveryEFunc, "Recovery E Func",
	rf_RecoveryEFunc, "Recovery E Func"
};


/*****************************************************************************
 *   The following encoding node function is used in
 *   EO_000_CreateLargeWriteDAG.
 *****************************************************************************/
int
rf_RegularPEFunc(RF_DagNode_t *node)
{
	/* Encode the node's data buffers into the buffer in results[1]. */
	rf_RegularESubroutine(node, node->results[1]);

	/* The XOR function is the one that performs the wakeup. */
	rf_RegularXorFunc(node);

	return (0);
}


/*****************************************************************************
 *  For EO_001_CreateSmallWriteDAG, there are (i) RegularONEFunc() and
 *  (ii) SimpleONEFunc() to be used. The former is used when the write
 *  accesses at least a full stripe unit's worth of sectors.
 *  The latter is used when the write accesses two stripe units but
 *  with total sectors less than sectors per SU. In this case, the accesses
 *  to parity and 'E' appear as disconnected areas in their stripe units, and
 *  the parity write and the 'E' write are each divided into two distinct
 *  writes (four in total). This simple old-new write and regular old-new
 *  write happen as in RAID-5.
 *****************************************************************************/

/*
 * Algorithm:
 *   1. Store the difference of old data and new data in the Rod buffer.
 *   2. Then encode this buffer into the buffer that already have old 'E'
 *	information inside it, the result can be shown to be the new 'E'
 *	information.
 *   3. Xor the Wnd buffer into the difference buffer to recover the original
 *	old data.
 * Here we have another alternative: to allocate a temporary buffer for
 * storing the difference of old data and new data, then encode temp buf
 * into old 'E' buf to form new 'E', but this approach takes the same speed
 * as the previous, and needs more memory.
 */
int
rf_RegularONEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	/* Index of the node parameter that holds the 'E' pda. */
	int EpdaIndex = (node->numParams - 1) / 2 - 1;
	RF_PhysDiskAddr_t *EPDA = (RF_PhysDiskAddr_t *)
	    node->params[EpdaIndex].p;
	/* Generally zero. */
	int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_PhysDiskAddr_t *curPda;
	RF_RowCol_t srcCol;
	RF_Etimer_t timer;
	char *diffBuf, *eDest;
	int prm, nbytes, suOff;

	RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
	RF_ASSERT(ESUOffset == 0);

	RF_ETIMER_START(timer);

	/*
	 * Pass 1: Xor each Wnd buffer into the matching Rod buffer, so the
	 * Rod buffer holds the difference between old and new data.
	 * The value returned by rf_bxor() is not used by this function.
	 */
	for (prm = 0; prm < EpdaIndex; prm += 2) {
		curPda = (RF_PhysDiskAddr_t *) node->params[prm].p;
		nbytes = rf_RaidAddressToByte(raidPtr, curPda->numSector);
		(void) rf_bxor(node->params[prm + EpdaIndex + 3].p,
		    node->params[prm + 1].p, nbytes, node->dagHdr->bp);
	}

	/*
	 * Pass 2: encode every difference buffer into the 'E' buffer
	 * (results[0]), which already holds the old 'E' information.
	 */
	for (prm = 0; prm < EpdaIndex; prm += 2) {
		/* Never encode the 'E' buffer into itself. */
		if (node->params[prm + 1].p == node->results[0])
			continue;
		curPda = (RF_PhysDiskAddr_t *) node->params[prm].p;
		diffBuf = (char *) node->params[prm + 1].p;
		srcCol = rf_EUCol(layoutPtr, curPda->raidAddress);
		suOff = rf_StripeUnitOffset(layoutPtr, curPda->startSector);
		eDest = ((char *) node->results[0]) +
		    rf_RaidAddressToByte(raidPtr, suOff);
		rf_e_encToBuf(raidPtr, srcCol, diffBuf, RF_EO_MATRIX_DIM - 2,
		    eDest, curPda->numSector);
	}

	/*
	 * Pass 3: repeat the Xor of pass 1 to turn the difference buffers
	 * back into the original old data, which the parity encoding
	 * function in the XorNode still needs.
	 */
	for (prm = 0; prm < EpdaIndex; prm += 2) {
		curPda = (RF_PhysDiskAddr_t *) node->params[prm].p;
		nbytes = rf_RaidAddressToByte(raidPtr, curPda->numSector);
		(void) rf_bxor(node->params[prm + EpdaIndex + 3].p,
		    node->params[prm + 1].p, nbytes, node->dagHdr->bp);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);

	/* The node is always reported with status 0. */
	rf_GenericWakeupFunc(node, 0);
	return (0);
}

/*
 * Simple old-new 'E' update for a small write.
 *
 * Parameters read from the node: params[0] is the pda whose column and
 * sector count drive the encoding, params[1] the buffer that is encoded,
 * params[2] a pda asserted to be of type Q, params[3] the 'E' destination
 * buffer, params[4] a pda of the writeDataNodes, params[5] the buffer that
 * is Xor-ed into params[1], and the last parameter the RF_Raid_t pointer.
 *
 * Returns whatever rf_GenericWakeupFunc() returns; the status handed to it
 * is the result of the first rf_bxor() call (0 when the DAG is not enabled).
 */
int
rf_SimpleONEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	RF_RowCol_t srcCol;
	char *diffBuf, *eBuf;
	int nbytes;
	int status = 0;

	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type ==
	    RF_PDA_TYPE_Q);

	/* Nothing to compute unless the DAG is enabled; just wake up. */
	if (node->dagHdr->status != rf_enable)
		return (rf_GenericWakeupFunc(node, status));

	RF_ETIMER_START(timer);

	/* Length comes from a pda of the writeDataNodes. */
	nbytes = rf_RaidAddressToByte(raidPtr,
	    ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);

	/* bxor into the buffer of the readDataNodes. */
	status = rf_bxor(node->params[5].p, node->params[1].p, nbytes,
	    node->dagHdr->bp);

	/*
	 * Column in the encoding matrix of the written column that is
	 * to be encoded into the redundant disk 'E'.
	 */
	srcCol = rf_EUCol(layoutPtr, pda->raidAddress);
	diffBuf = node->params[1].p;
	eBuf = node->params[3].p;
	rf_e_encToBuf(raidPtr, srcCol, diffBuf, RF_EO_MATRIX_DIM - 2, eBuf,
	    pda->numSector);

	/* Second Xor with the same operands undoes the first one. */
	rf_bxor(node->params[5].p, node->params[1].p, nbytes,
	    node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);

	/* Call the wakeup function explicitly since this node does no I/O. */
	return (rf_GenericWakeupFunc(node, status));
}


/*
 * Called by rf_RegularPEFunc(node) and rf_RegularEFunc(node)
 * in f.f. large write.
 */
void
rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_PhysDiskAddr_t *curPda;
	RF_RowCol_t srcCol;
	RF_Etimer_t timer;
	char *src, *dst;
	int prm, suOff;

	RF_ETIMER_START(timer);

	/*
	 * The parameters are walked as (pda, buffer) pairs; the last two
	 * parameters are not part of the walk.
	 */
	for (prm = 0; prm < node->numParams - 2; prm += 2) {
		RF_ASSERT(node->params[prm + 1].p != ebuf);

		curPda = (RF_PhysDiskAddr_t *) node->params[prm].p;
		src = (char *) node->params[prm + 1].p;
		srcCol = rf_EUCol(layoutPtr, curPda->raidAddress);

		/* Land at the same stripe-unit offset inside ebuf. */
		suOff = rf_StripeUnitOffset(layoutPtr, curPda->startSector);
		dst = ebuf + rf_RaidAddressToByte(raidPtr, suOff);

		rf_e_encToBuf(raidPtr, srcCol, src, RF_EO_MATRIX_DIM - 2,
		    dst, curPda->numSector);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	/* The elapsed time is charged to the xor_us trace counter. */
	tracerec->xor_us += RF_ETIMER_VAL_US(timer);
}


/*****************************************************************************
 *			 Used in  EO_001_CreateLargeWriteDAG.
 *****************************************************************************/
int
rf_RegularEFunc(RF_DagNode_t *node)
{
	/* Encode the node's data buffers into the buffer in results[0]. */
	rf_RegularESubroutine(node, node->results[0]);

	/* Wake up explicitly, always with status 0. */
	rf_GenericWakeupFunc(node, 0);

	return (0);
}


/*****************************************************************************
 * This degraded function allows only two cases:
 *   1. When the write accesses the full failed stripe unit, the access can
 *	span more than one stripe unit.
 *   2. When the write accesses only part of the failed SU, we assume accesses
 *	of more than one stripe unit are not allowed, so that the write can be
 *	dealt with like a large write.
 * The following function is based on these assumptions. So except in the
 * second case, it looks the same as a large write encoding function. But
 * this is not exactly the normal way of doing a degraded write, since
 * RAIDframe has to break accesses other than the above two cases into
 * smaller accesses. We may have to change rf_DegrESubroutine in the future.
 *****************************************************************************/
void
rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	/* The second-to-last parameter is the pda of the failed unit. */
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *)
	    node->params[node->numParams - 2].p;
	int failedSUOffset = rf_StripeUnitOffset(layoutPtr,
	    failedPDA->startSector);
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_PhysDiskAddr_t *curPda;
	RF_RowCol_t srcCol;
	RF_Etimer_t timer;
	char *src, *dst;
	int prm, suOff;

	RF_ETIMER_START(timer);

	/* Walk the (pda, buffer) parameter pairs. */
	for (prm = 0; prm < node->numParams - 2; prm += 2) {
		RF_ASSERT(node->params[prm + 1].p != ebuf);

		curPda = (RF_PhysDiskAddr_t *) node->params[prm].p;
		src = (char *) node->params[prm + 1].p;
		srcCol = rf_EUCol(layoutPtr, curPda->raidAddress);

		/*
		 * The position inside ebuf is taken relative to the start
		 * of the failed access within its stripe unit.
		 */
		suOff = rf_StripeUnitOffset(layoutPtr, curPda->startSector);
		dst = ebuf + rf_RaidAddressToByte(raidPtr,
		    suOff - failedSUOffset);

		rf_e_encToBuf(raidPtr, srcCol, src, RF_EO_MATRIX_DIM - 2,
		    dst, curPda->numSector);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	/* The elapsed time is charged to the q_us trace counter. */
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
}


/*****************************************************************************
 * This function is used in case where one data disk failed and both redundant
 * disks are alive. It is used in the EO_100_CreateWriteDAG. Note: if there is
 * another disk failed in the stripe but not accessed at this time, then we
 * should, instead, use the rf_EOWriteDoubleRecoveryFunc().
 *****************************************************************************/
int
rf_Degraded_100_EOFunc(RF_DagNode_t *node)
{
	/* Encode into the buffer held in results[1]. */
	rf_DegrESubroutine(node, node->results[1]);

	/* The recovery XOR function is the one that does the wakeup. */
	rf_RecoveryXorFunc(node);

	return (0);
}


/*****************************************************************************
 * This function is to encode one sector in one of the data disks to the E
 * disk. However, in evenodd this function can also be used as decoding
 * function to recover data from dead disk in the case of parity failure and
 * a single data failure.
 *****************************************************************************/
void
rf_e_EncOneSect(RF_RowCol_t srcLogicCol, char *srcSecbuf,
    RF_RowCol_t destLogicCol, char *destSecbuf, int bytesPerSector)
{
	/* A sector is divided into RF_EO_MATRIX_DIM - 1 encoding units. */
	int numRowInEncMatrix = (RF_EO_MATRIX_DIM) - 1;
	int bytesPerEU = bytesPerSector / numRowInEncMatrix;
	/*
	 * Index of the EU in the source column that is Xor-ed into every
	 * EU of the destination sector.
	 */
	int adjRow;
	/* Row of the EU being updated in the destination column. */
	RF_RowCol_t destRow;
	/* Row of the EU in the source column paired with destRow. */
	RF_RowCol_t srcRow;
	RF_RowCol_t w;

#if	RF_EO_MATRIX_DIM > 17
	int shortsPerEU = bytesPerEU / sizeof(short);
	short *dstEU, *adjEU, *srcEU;

	RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
#elif	RF_EO_MATRIX_DIM == 17
	int longsPerEU = bytesPerEU / sizeof(long);
	long *dstEU, *adjEU, *srcEU;

	RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
#endif

	adjRow = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
#if	RF_EO_MATRIX_DIM > 17
	adjEU = (short *) (srcSecbuf + adjRow * bytesPerEU);
#elif	RF_EO_MATRIX_DIM == 17
	adjEU = (long *) (srcSecbuf + adjRow * bytesPerEU);
#endif

	for (destRow = 0; destRow < numRowInEncMatrix; destRow++) {
		srcRow = rf_EO_Mod((destRow + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);

#if	RF_EO_MATRIX_DIM > 17
		dstEU = (short *) (destSecbuf + destRow * bytesPerEU);
		if (srcRow != RF_EO_MATRIX_DIM - 1) {
			/* Note: adjRow is never the end row for a src col. */
			srcEU = (short *) (srcSecbuf + srcRow * bytesPerEU);
			for (w = 0; w < shortsPerEU; w++)
				dstEU[w] ^= adjEU[w] ^ srcEU[w];
		} else {
			/*
			 * The end row (RF_EO_MATRIX_DIM - 1) is all zero, so
			 * only the adjuster EU contributes.
			 */
			for (w = 0; w < shortsPerEU; w++)
				dstEU[w] ^= adjEU[w];
		}
#elif	RF_EO_MATRIX_DIM == 17
		dstEU = (long *) (destSecbuf + destRow * bytesPerEU);
		if (srcRow != RF_EO_MATRIX_DIM - 1) {
			srcEU = (long *) (srcSecbuf + srcRow * bytesPerEU);
			for (w = 0; w < longsPerEU; w++)
				dstEU[w] ^= adjEU[w] ^ srcEU[w];
		} else {
			/*
			 * The end row (RF_EO_MATRIX_DIM - 1) is all zero, so
			 * only the adjuster EU contributes.
			 */
			for (w = 0; w < longsPerEU; w++)
				dstEU[w] ^= adjEU[w];
		}
#endif
	}
}

/*
 * Encode numSector consecutive sectors of srcbuf (logical column
 * srcLogicCol) into destbuf (logical column destLogicCol), one sector at a
 * time via rf_e_EncOneSect().
 */
void
rf_e_encToBuf(RF_Raid_t *raidPtr, RF_RowCol_t srcLogicCol, char *srcbuf,
    RF_RowCol_t destLogicCol, char *destbuf, int numSector)
{
	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	int remaining;

	for (remaining = numSector; remaining > 0; remaining--) {
		rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf,
		    bytesPerSector);
		/* Step both cursors to the next sector. */
		srcbuf += bytesPerSector;
		destbuf += bytesPerSector;
	}
}


/*****************************************************************************
 * When parity and one data disk have died, we use the second redundant
 * information, 'E', to recover the data on the dead disk. This function is
 * used in the recovery node for EO_110_CreateReadDAG.
 *****************************************************************************/
int
rf_RecoveryEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	/* The second-to-last parameter is the pda of the failed unit. */
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *)
	    node->params[node->numParams - 2].p;
	/* Logical column of the failed SU. */
	RF_RowCol_t fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);
	int failedSUOffset = rf_StripeUnitOffset(layoutPtr,
	    failedPDA->startSector);
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_PhysDiskAddr_t *curPda;
	RF_RowCol_t srcCol;	/* source logical column */
	RF_Etimer_t timer;
	char *src, *dst;
	int prm, suOff;

	/* The recovery buffer is cleared before anything is Xor-ed in. */
	bzero((char *) node->results[0],
	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));

	/* When the DAG is not enabled only the wakeup is performed. */
	if (node->dagHdr->status != rf_enable)
		return (rf_GenericWakeupFunc(node, 0));

	RF_ETIMER_START(timer);
	for (prm = 0; prm < node->numParams - 2; prm += 2) {
		/* The result buffer itself is never a source. */
		if (node->params[prm + 1].p == node->results[0])
			continue;

		curPda = (RF_PhysDiskAddr_t *) node->params[prm].p;
		/* The last pair is the redundant 'E' column. */
		if (prm == node->numParams - 4)
			srcCol = RF_EO_MATRIX_DIM - 2;
		else
			srcCol = rf_EUCol(layoutPtr, curPda->raidAddress);

		src = (char *) node->params[prm + 1].p;
		suOff = rf_StripeUnitOffset(layoutPtr, curPda->startSector);
		dst = ((char *) node->results[0]) +
		    rf_RaidAddressToByte(raidPtr, suOff - failedSUOffset);
		rf_e_encToBuf(raidPtr, srcCol, src, fcol, dst,
		    curPda->numSector);
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->xor_us += RF_ETIMER_VAL_US(timer);

	/* The node is reported as having executed successfully. */
	return (rf_GenericWakeupFunc(node, 0));
}


/*****************************************************************************
 * This function is used in the case where one data disk and the parity have
 * failed (in EO_110_CreateWriteDAG).
 *****************************************************************************/
int
rf_EO_DegradedWriteEFunc(RF_DagNode_t *node)
{
	/* Encode into the buffer held in results[0]. */
	rf_DegrESubroutine(node, node->results[0]);

	/* Wake up explicitly, always with status 0. */
	rf_GenericWakeupFunc(node, 0);

	return (0);
}



/*****************************************************************************
 *	THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES.
 *****************************************************************************/

/*
 * Decode one sector of two failed data columns.
 *
 *   raidPtr  array descriptor; supplies the sector size and numDataCol.
 *   rrdbuf   per-data-column source buffers, indexed by logical column;
 *            the entries for the two failed columns are not read.
 *   dest     dest[0] receives the sector of column fcol[0], dest[1] that
 *            of column fcol[1]; the first long of each is asserted to be
 *            zero on entry.
 *   fcol     the two failed logical columns; they must differ.
 *   pbuf     parity sector.
 *   ebuf     'E' sector.
 *
 * Two scratch buffers of one encoding unit each are allocated with
 * RF_Malloc() and released before returning.
 */
void
rf_doubleEOdecode(RF_Raid_t *raidPtr, char **rrdbuf, char **dest,
    RF_RowCol_t *fcol, char *pbuf, char *ebuf)
{
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
	int i, j, k, f1, f2, row;
	int rrdrow, erow, count = 0;
	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	int numRowInEncMatrix = (RF_EO_MATRIX_DIM) - 1;
#if 0
	int pcol = (RF_EO_MATRIX_DIM) - 1;
#endif
	int ecol = (RF_EO_MATRIX_DIM) - 2;
	int bytesPerEU = bytesPerSector / numRowInEncMatrix;
	int numDataCol = layoutPtr->numDataCol;
#if	RF_EO_MATRIX_DIM > 17
	int shortsPerEU = bytesPerEU / sizeof(short);
	short *rrdbuf_current, *pbuf_current, *ebuf_current;
	short *dest_smaller, *dest_smaller_current;
	short *dest_larger, *dest_larger_current;
	short *temp;
	short *P;

	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
	RF_Malloc(P, bytesPerEU, (short *));
	RF_Malloc(temp, bytesPerEU, (short *));
#elif	RF_EO_MATRIX_DIM == 17
	int longsPerEU = bytesPerEU / sizeof(long);
	long *rrdbuf_current, *pbuf_current, *ebuf_current;
	long *dest_smaller, *dest_smaller_current;
	long *dest_larger, *dest_larger_current;
	long *temp;
	long *P;

	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
	RF_Malloc(P, bytesPerEU, (long *));
	RF_Malloc(temp, bytesPerEU, (long *));
#endif
	RF_ASSERT(*((long *) dest[0]) == 0);
	RF_ASSERT(*((long *) dest[1]) == 0);
	bzero((char *) P, bytesPerEU);
	bzero((char *) temp, bytesPerEU);
	RF_ASSERT(*P == 0);
	/*
	 * Calculate the 'P' parameter, which, not parity, is the Xor of all
	 * elements in the last two columns, ie. 'E' and 'parity' columns, see
	 * the Ref. paper by Blaum, et al 1993.
	 *
	 * The inner loop bound is selected per word type: longsPerEU only
	 * exists in the RF_EO_MATRIX_DIM == 17 configuration, so the short
	 * configuration must iterate over shortsPerEU.
	 */
#if	RF_EO_MATRIX_DIM > 17
	for (i = 0; i < numRowInEncMatrix; i++)
		for (k = 0; k < shortsPerEU; k++) {
			ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
			pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
			P[k] ^= *ebuf_current;
			P[k] ^= *pbuf_current;
		}
#elif	RF_EO_MATRIX_DIM == 17
	for (i = 0; i < numRowInEncMatrix; i++)
		for (k = 0; k < longsPerEU; k++) {
			ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
			pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
			P[k] ^= *ebuf_current;
			P[k] ^= *pbuf_current;
		}
#endif
	RF_ASSERT(fcol[0] != fcol[1]);
	/* Order the failed columns so that f1 < f2. */
	if (fcol[0] < fcol[1]) {
#if	RF_EO_MATRIX_DIM > 17
		dest_smaller = (short *) (dest[0]);
		dest_larger = (short *) (dest[1]);
#elif	RF_EO_MATRIX_DIM == 17
		dest_smaller = (long *) (dest[0]);
		dest_larger = (long *) (dest[1]);
#endif
		f1 = fcol[0];
		f2 = fcol[1];
	} else {
#if	RF_EO_MATRIX_DIM > 17
		dest_smaller = (short *) (dest[1]);
		dest_larger = (short *) (dest[0]);
#elif	RF_EO_MATRIX_DIM == 17
		dest_smaller = (long *) (dest[1]);
		dest_larger = (long *) (dest[0]);
#endif
		f1 = fcol[1];
		f2 = fcol[0];
	}
	/*
	 * Start from the bottom row and step by (f1 - f2) modulo
	 * RF_EO_MATRIX_DIM until the bottom row is reached again. 'temp' is
	 * deliberately carried over from one iteration to the next.
	 */
	row = (RF_EO_MATRIX_DIM) - 1;
	while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) !=
	    ((RF_EO_MATRIX_DIM) - 1)) {
#if	RF_EO_MATRIX_DIM > 17
		dest_larger_current = dest_larger + row * shortsPerEU;
		dest_smaller_current = dest_smaller + row * shortsPerEU;
#elif	RF_EO_MATRIX_DIM == 17
		dest_larger_current = dest_larger + row * longsPerEU;
		dest_smaller_current = dest_smaller + row * longsPerEU;
#endif
		/*
		 * Do the diagonal recovery. Initially, temp[k] = (failed 1),
		 * which is the failed data in the column that has smaller
		 * col index.
		 */
		/* Step 1:  ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
		for (j = 0; j < numDataCol; j++) {
			if (j == f1 || j == f2)
				continue;
			rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
			if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
#if	RF_EO_MATRIX_DIM > 17
				rrdbuf_current = (short *) (rrdbuf[j]) +
				    rrdrow * shortsPerEU;
				for (k = 0; k < shortsPerEU; k++)
					temp[k] ^= *(rrdbuf_current + k);
#elif	RF_EO_MATRIX_DIM == 17
				rrdbuf_current = (long *) (rrdbuf[j]) +
				    rrdrow * longsPerEU;
				for (k = 0; k < longsPerEU; k++)
					temp[k] ^= *(rrdbuf_current + k);
#endif
			}
		}
		/*
		 * Step 2:  ^E(erow,m-2), If erow is at the bottom row, don't
		 * Xor into it.  E(erow,m-2) = (principal diagonal) ^ (failed
		 * 1) ^ (failed 2) ^ (SUM of nonfailed in-diagonal
		 * A(rrdrow,0..m-3))
		 * After this step, temp[k] = (principal diagonal) ^ (failed 2).
		 */

		erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
		if (erow != (RF_EO_MATRIX_DIM) - 1) {
#if	RF_EO_MATRIX_DIM > 17
			ebuf_current = (short *) ebuf + shortsPerEU * erow;
			for (k = 0; k < shortsPerEU; k++)
				temp[k] ^= *(ebuf_current + k);
#elif	RF_EO_MATRIX_DIM == 17
			ebuf_current = (long *) ebuf + longsPerEU * erow;
			for (k = 0; k < longsPerEU; k++)
				temp[k] ^= *(ebuf_current + k);
#endif
		}
		/*
		 * Step 3: ^P to obtain the failed data (failed 2). P can be
		 * proved to be actually (principal diagonal). After this
		 * step, temp[k] = (failed 2), the failed data to be recovered.
		 */
#if	RF_EO_MATRIX_DIM > 17
		for (k = 0; k < shortsPerEU; k++)
			temp[k] ^= P[k];
		/* Put the data into the destination buffer. */
		for (k = 0; k < shortsPerEU; k++)
			dest_larger_current[k] = temp[k];
#elif	RF_EO_MATRIX_DIM == 17
		for (k = 0; k < longsPerEU; k++)
			temp[k] ^= P[k];
		/* Put the data into the destination buffer. */
		for (k = 0; k < longsPerEU; k++)
			dest_larger_current[k] = temp[k];
#endif

		/* THE FOLLOWING DO THE HORIZONTAL XOR. */
		/*
		 * Step 1:  ^(SUM of A(row,0..m-3)), ie. all nonfailed data
		 * columns.
		 */
		for (j = 0; j < numDataCol; j++) {
			if (j == f1 || j == f2)
				continue;
#if	RF_EO_MATRIX_DIM > 17
			rrdbuf_current = (short *) (rrdbuf[j]) +
			    row * shortsPerEU;
			for (k = 0; k < shortsPerEU; k++)
				temp[k] ^= *(rrdbuf_current + k);
#elif	RF_EO_MATRIX_DIM == 17
			rrdbuf_current = (long *) (rrdbuf[j]) +
			    row * longsPerEU;
			for (k = 0; k < longsPerEU; k++)
				temp[k] ^= *(rrdbuf_current + k);
#endif
		}
		/* Step 2: ^A(row,m-1) */
		/* Step 3: Put the data into the destination buffer. */
#if	RF_EO_MATRIX_DIM > 17
		pbuf_current = (short *) pbuf + shortsPerEU * row;
		for (k = 0; k < shortsPerEU; k++)
			temp[k] ^= *(pbuf_current + k);
		for (k = 0; k < shortsPerEU; k++)
			dest_smaller_current[k] = temp[k];
#elif	RF_EO_MATRIX_DIM == 17
		pbuf_current = (long *) pbuf + longsPerEU * row;
		for (k = 0; k < longsPerEU; k++)
			temp[k] ^= *(pbuf_current + k);
		for (k = 0; k < longsPerEU; k++)
			dest_smaller_current[k] = temp[k];
#endif
		count++;
	}
	/*
	 * Check that every encoding unit in the data buffer has been
	 * decoded. According to EvenOdd theory, if "RF_EO_MATRIX_DIM" is a
	 * prime number, this algorithm covers the whole buffer.
	 */
	RF_ASSERT(count == numRowInEncMatrix);
	RF_Free((char *) P, bytesPerEU);
	RF_Free((char *) temp, bytesPerEU);
}


/*****************************************************************************
 *	This function is called by double degraded read EO_200_CreateReadDAG.
 *****************************************************************************/
/*
 * Recovery node function for a double-degraded read.
 *
 * Node parameters, as read by this function: a leading run of data pdas,
 * the parity pda at params[np - 4], the 'E' pda at params[np - 3], the
 * RF_Raid_t pointer at params[np - 2] and the access stripe map at
 * params[np - 1]. node->results[] holds one or two pdas describing the
 * failed data that is being read; their buffers are zeroed and then filled
 * sector by sector through rf_doubleEOdecode().
 *
 * Always wakes the node with status 0 and returns 0.
 */
int
rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int ndataParam = 0;
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
	    node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i, prm, sector, nresults = node->numResults;
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	unsigned sosAddr;
	int two = 0, mallc_one = 0, mallc_two = 0;	/*
							 * Flags to indicate if
							 * memory is allocated.
							 */
	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
	    npda;
	RF_RowCol_t fcol[2], fsuoff[2], fsuend[2],
	    numDataCol = layoutPtr->numDataCol;
	char **buf, *ebuf, *pbuf, *dest[2];
	/*
	 * psuoff/esuoff start at 0 so that they are never read
	 * uninitialized if the unsupported branch below is reached with
	 * assertions compiled out.
	 */
	long *suoff = NULL, *suend = NULL, *prmToCol = NULL;
	long psuoff = 0, esuoff = 0;
	RF_SectorNum_t startSector, endSector;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	/*
	 * Find out the number of parameters that are pdas for data
	 * information. The last two parameters are the raid pointer and
	 * the access stripe map, not pdas, so the scan stops before them
	 * (and never indexes past the end of params[]).
	 */
	for (i = 0; i < np - 2; i++)
		if (((RF_PhysDiskAddr_t *) node->params[i].p)->type !=
		    RF_PDA_TYPE_DATA) {
			ndataParam = i;
			break;
		}
	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
	if (ndataParam != 0) {
		RF_Malloc(suoff, ndataParam * sizeof(long), (long *));
		RF_Malloc(suend, ndataParam * sizeof(long), (long *));
		RF_Malloc(prmToCol, ndataParam * sizeof(long), (long *));
	}
	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector +
	     asmap->failedPDAs[0]->numSector) < secPerSU) {
		RF_ASSERT(0);	/* Currently, no support for this situation. */
		ppda = node->params[np - 6].p;
		ppda2 = node->params[np - 5].p;
		RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
		epda = node->params[np - 4].p;
		epda2 = node->params[np - 3].p;
		RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		epda = node->params[np - 3].p;
		psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
		esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
		RF_ASSERT(psuoff == esuoff);
	}
	/*
	 * The following has three goals:
	 *   1. Determine the startSector to begin decoding and endSector
	 *	to end decoding.
	 *   2. Determine the column numbers of the two failed disks.
	 *   3. Determine the offset and end offset of the access within
	 *	each failed stripe unit.
	 * All of these are offsets within the stripe unit.
	 */
	if (nresults == 1) {
		/* Find the startSector to begin decoding. */
		pda = node->results[0];
		bzero(pda->bufPtr, bytesPerSector * pda->numSector);
		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		fsuend[0] = fsuoff[0] + pda->numSector;
		startSector = fsuoff[0];
		endSector = fsuend[0];

		/* Find out the column of failed disk being accessed. */
		fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);

		/* Find out the other failed column not accessed. */
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
		    asmap->raidAddress);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr,
			    npda.raidAddress, &(npda.row), &(npda.col),
			    &(npda.startSector), 0);
			/* Stop at the dead disk that is not fcol[0]. */
			if (RF_DEAD_DISK(raidPtr
			    ->Disks[npda.row][npda.col].status))
				if (i != fcol[0])
					break;
		}
		RF_ASSERT(i < numDataCol);
		fcol[1] = i;
	} else {
		RF_ASSERT(nresults == 2);
		pda0 = node->results[0];
		bzero(pda0->bufPtr, bytesPerSector * pda0->numSector);
		pda1 = node->results[1];
		bzero(pda1->bufPtr, bytesPerSector * pda1->numSector);
		/*
		 * Determine the failed column numbers of the two failed
		 * disks.
		 */
		fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
		fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
		/*
		 * Determine the offset and end offset of the access within
		 * each failed stripe unit.
		 */
		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
		fsuend[0] = fsuoff[0] + pda0->numSector;
		fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
		fsuend[1] = fsuoff[1] + pda1->numSector;
		/*
		 * Determine the startSector to begin decoding. It must be a
		 * stripe-unit offset like endSector and the bounds tested in
		 * the loop below, so the SU offsets are compared here rather
		 * than the pdas' absolute start sectors.
		 */
		startSector = RF_MIN(fsuoff[0], fsuoff[1]);
		/* Determine the endSector to end decoding. */
		endSector = RF_MAX(fsuend[0], fsuend[1]);
	}
	/*
	 * Assign the beginning sector and the end sector for each parameter.
	 * Find out the corresponding column # for each parameter.
	 */
	for (prm = 0; prm < ndataParam; prm++) {
		pda = node->params[prm].p;
		suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		suend[prm] = suoff[prm] + pda->numSector;
		prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
	}
	/*
	 * 'sector' is the sector for the current decoding algorithm. For each
	 * sector in the failed SU
	 * 1. Find out the corresponding parameters that cover the current
	 *    sector and that are needed for the decoding of this sector in
	 *    failed SU.
	 * 2. Find out if sector is in the shadow of any accessed failed SU.
	 *    If not, malloc a temporary space of a sector in size.
	 */
	for (sector = startSector; sector < endSector; sector++) {
		if (nresults == 2)
			if (!(fsuoff[0] <= sector && sector < fsuend[0]) &&
			    !(fsuoff[1] <= sector && sector < fsuend[1]))
				continue;
		for (prm = 0; prm < ndataParam; prm++)
			if (suoff[prm] <= sector && sector < suend[prm])
				buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *)
				    node->params[prm].p)->bufPtr +
				    rf_RaidAddressToByte(raidPtr,
				     sector - suoff[prm]);
		/*
		 * Find out if sector is in the shadow of any accessed failed
		 * SU. If yes, assign dest[0], dest[1] to point at suitable
		 * position of the buffer corresponding to failed SUs. If no,
		 * malloc a temporary space of a sector in size for
		 * destination of decoding.
		 */
		RF_ASSERT(nresults == 1 || nresults == 2);
		if (nresults == 1) {
			dest[0] = ((RF_PhysDiskAddr_t *)
			    node->results[0])->bufPtr +
			    rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
			/* Always malloc temp buffer to dest[1]. */
			RF_Malloc(dest[1], bytesPerSector, (char *));
			bzero(dest[1], bytesPerSector);
			mallc_two = 1;
		} else {
			if (fsuoff[0] <= sector && sector < fsuend[0])
				dest[0] = ((RF_PhysDiskAddr_t *)
				    node->results[0])->bufPtr +
				    rf_RaidAddressToByte(raidPtr,
				     sector - fsuoff[0]);
			else {
				RF_Malloc(dest[0], bytesPerSector, (char *));
				bzero(dest[0], bytesPerSector);
				mallc_one = 1;
			}
			if (fsuoff[1] <= sector && sector < fsuend[1])
				dest[1] = ((RF_PhysDiskAddr_t *)
				    node->results[1])->bufPtr +
				    rf_RaidAddressToByte(raidPtr,
				     sector - fsuoff[1]);
			else {
				RF_Malloc(dest[1], bytesPerSector, (char *));
				bzero(dest[1], bytesPerSector);
				mallc_two = 1;
			}
			RF_ASSERT(mallc_one == 0 || mallc_two == 0);
		}
		pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr,
		    sector - psuoff);
		ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr,
		    sector - esuoff);
		/*
		 * After finding all needed sectors, call the doubleEOdecode
		 * function to decode one sector to the destination.
		 */
		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
		/*
		 * Free all allocated memory, and mark flag to indicate no
		 * memory is being allocated.
		 */
		if (mallc_one == 1)
			RF_Free(dest[0], bytesPerSector);
		if (mallc_two == 1)
			RF_Free(dest[1], bytesPerSector);
		mallc_one = mallc_two = 0;
	}
	RF_Free(buf, numDataCol * sizeof(char *));
	if (ndataParam != 0) {
		RF_Free(suoff, ndataParam * sizeof(long));
		RF_Free(suend, ndataParam * sizeof(long));
		RF_Free(prmToCol, ndataParam * sizeof(long));
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec) {
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	}
	rf_GenericWakeupFunc(node, 0);
	return (0);
}


/*
 * Currently, this function only supports an access that touches one of
 * the two failed stripe units (SUs). asmap->numStripeUnitsAccessed is
 * also limited to one; RAIDframe breaks a large access into multiple
 * accesses of a single stripe unit each.
 */

/*
 * DAG node function for a write that targets one of two failed data
 * stripe units of an EVENODD stripe.
 *
 * Inputs taken from the node, as used below:
 *   params[0 .. numDataCol - 3]   PDAs whose buffers hold the data read
 *                                 from the surviving data columns.
 *   params[numDataCol]            PDA whose buffer holds the new data.
 *   params[np - 2]                the RF_Raid_t pointer.
 *   params[np - 1]                the access stripe map.
 *   results[0], results[1]        PDAs of the P and E buffers (not plain
 *                                 buffers).
 *
 * The old contents of the accessed failed unit are rebuilt one sector at
 * a time with rf_doubleEOdecode(), XORed with the new data, and the result
 * is handed to rf_e_encToBuf() (E buffer as destination) and XORed into
 * the P buffer. The elapsed time is added to tracerec->q_us when a trace
 * record exists. The node is always woken with status 0 and 0 is returned.
 */
int
rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap =
	    (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	RF_RowCol_t numDataCol = layoutPtr->numDataCol;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_RowCol_t fcol[2];	/* Logical columns of the two failed units. */
	RF_RowCol_t dcol, wcol;
	RF_SectorNum_t sector, firstSec, limitSec;
	RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
	RF_int64 nbytes;
	/*
	 * NOTE(review): sosAddr is a plain unsigned; confirm that a stripe
	 * boundary address always fits, otherwise the value is truncated.
	 */
	unsigned sosAddr;
	char **buf;		/* buf[c]: data read from logical column c. */
	char *pbuf, *ebuf;	/* Cursors into the P and E buffers. */
	char *dest[2];		/* Cursors into rebuilt[0] and rebuilt[1]. */
	char *rebuilt[2];	/* Scratch space for the two decoded units. */
	int k, i, j;
	RF_Etimer_t timer;

	/* Only the case where the other failed SU is not accessed. */
	RF_ASSERT(asmap->numDataFailed == 1);

	RF_ETIMER_START(timer);
	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));

	ppda = node->results[0];
	epda = node->results[1];
	fpda = asmap->failedPDAs[0];

	/* Sector range, relative to the stripe unit, that must be decoded. */
	firstSec = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
	limitSec = firstSec + fpda->numSector;

	/* Hook every surviving column's buffer into its slot of buf[]. */
	k = 0;
	while (k < numDataCol - 2) {
		pda = (RF_PhysDiskAddr_t *) node->params[k].p;
		dcol = rf_EUCol(layoutPtr, pda->raidAddress);
		buf[dcol] = pda->bufPtr;
		k++;
	}

	/* These two advance by one sector per decoding step. */
	pbuf = ppda->bufPtr;
	ebuf = epda->bufPtr;

	/* Logical column of the failed unit that is being written. */
	fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);

	/*
	 * Walk the data columns of this stripe and stop at the first one
	 * that maps to a dead disk and is not fcol[0]: that is the second
	 * failed column, which this access does not touch.
	 */
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
	    asmap->raidAddress);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(layoutPtr->map->MapSector) (raidPtr, npda.raidAddress,
		    &(npda.row), &(npda.col), &(npda.startSector), 0);
		if ((RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
		    && i != fcol[0])
			break;
	}
	RF_ASSERT(i < numDataCol);
	fcol[1] = i;

	/* Zeroed scratch buffers that receive the two decoded units. */
	nbytes = fpda->numSector * bytesPerSector;
	for (j = 0; j < 2; j++) {
		RF_Malloc(rebuilt[j], nbytes, (char *));
		bzero(rebuilt[j], nbytes);
		dest[j] = rebuilt[j];
	}

	/*
	 * Decode one sector per pass; afterwards move every source and
	 * destination cursor forward by one sector. The slots of the two
	 * failed columns in buf[] are never advanced.
	 */
	sector = firstSec;
	while (sector < limitSec) {
		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
		for (j = 0; j < numDataCol; j++) {
			if (j == fcol[0] || j == fcol[1])
				continue;
			buf[j] += bytesPerSector;
		}
		dest[0] += bytesPerSector;
		dest[1] += bytesPerSector;
		ebuf += bytesPerSector;
		pbuf += bytesPerSector;
		sector++;
	}

	/*
	 * rebuilt[0] now holds the old data of the written unit. With the
	 * simplest case assumed (one stripe unit accessed, one failed data
	 * unit), params[numDataCol] carries the new data: XOR it into the
	 * old data and then proceed as for a small write.
	 */
	pda = (RF_PhysDiskAddr_t *) node->params[numDataCol].p;
	rf_bxor(pda->bufPtr, rebuilt[0], nbytes, node->dagHdr->bp);

	/*
	 * New 'E': encode rebuilt[0] (source) for the written column into
	 * epda->bufPtr (destination).
	 */
	wcol = rf_EUCol(layoutPtr, fpda->raidAddress);
	rf_e_encToBuf(raidPtr, wcol, rebuilt[0], RF_EO_MATRIX_DIM - 2,
	    epda->bufPtr, fpda->numSector);

	/* New 'P': XOR the same buffer into the P buffer. */
	rf_bxor(rebuilt[0], ppda->bufPtr, nbytes, node->dagHdr->bp);

	/* Release the scratch space and the column pointer table. */
	RF_Free(rebuilt[0], nbytes);
	RF_Free(rebuilt[1], nbytes);
	RF_Free(buf, numDataCol * sizeof(char *));

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
